package cn.doitedu.profile.ml.comment

import java.util

import com.hankcs.hanlp.HanLP
import com.hankcs.hanlp.seg.common.Term
import org.apache.spark.ml.classification.{NaiveBayes, NaiveBayesModel}
import org.apache.spark.ml.feature.{HashingTF, IDF}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

/**
 * @author 涛哥
 * @nick_name "deep as the sea"
 * @contact qq:657270652 wx:doit_edu
 * @site www.doitedu.cn
 * @date 2021-06-23
 * @desc Semantic comment classifier (语义评论分类器)
 */
object CommentClassify {

  /**
   * Trains and evaluates a three-class Naive Bayes classifier on text comments.
   *
   * Steps:
   *  1. Read three text directories and label their lines 0.0 (poor),
   *     1.0 (general) and 2.0 (good).
   *  2. Segment every comment into words with HanLP.
   *  3. Hash the words into a 100000-dimensional term-frequency vector.
   *  4. Split the samples 80% / 20% into a training and a test set.
   *  5. Fit IDF on the training set only and turn both sets into TF-IDF vectors.
   *  6. Train a Naive Bayes model, save it, load it back, predict the test set
   *     and print the fraction of correctly classified test comments.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .config("spark.sql.shuffle.partitions","1")
      .appName("")
      .master("local")
      .getOrCreate()

    // Release the session even when one of the stages below fails.
    try {
      import spark.implicits._

      // Load the labelled samples: one comment per line, label derived from the directory.
      val ds0 = spark.read.textFile("C:\\Users\\83544\\Desktop\\doit21-datayi-day13\\comment\\poor").map(s => (0.0, s))
      val ds1 = spark.read.textFile("C:\\Users\\83544\\Desktop\\doit21-datayi-day13\\comment\\general").map(s => (1.0, s))
      val ds2 = spark.read.textFile("C:\\Users\\83544\\Desktop\\doit21-datayi-day13\\comment\\good").map(s => (2.0, s))

      // Merge all samples into a single dataset.
      val sample = ds0.union(ds1).union(ds2)

      // Word segmentation: (label, comment) => (label, array of words).
      val wordDF = sample.map { case (label, comment) =>
        // Explicit converters instead of the deprecated implicit JavaConversions.
        import scala.collection.JavaConverters._
        val terms: util.List[Term] = HanLP.segment(comment)
        (label, terms.asScala.map(_.word).toArray)
      }.toDF("label", "words")
      // wordDF columns: label (double), words (array of strings), e.g.
      //   0.0 | [拍照, 太, 假, ，, 手机, ...]

      // Vectorization: hash every word into one of 100000 buckets and count
      // occurrences, producing a sparse term-frequency vector per comment.
      val hashingTF = new HashingTF()
        .setNumFeatures(100000)
        .setInputCol("words")
        .setOutputCol("tf_vec")
      val tfVecDF = hashingTF.transform(wordDF).drop("words")

      // Split the samples into a training set (80%) and a test set (20%).
      // The input is cached so the split and the later passes reuse the same
      // materialized rows instead of re-running the segmentation.
      tfVecDF.cache()
      val splits: Array[DataFrame] = tfVecDF.randomSplit(Array(0.8, 0.2))
      val trainTf = splits(0)
      val testTf = splits(1)

      // Fit IDF on the training set only, so that document frequencies of the
      // test comments do not leak into the features the model is trained on.
      val idf = new IDF()
        .setInputCol("tf_vec")
        .setOutputCol("tfidf_vec")
      val idfModel = idf.fit(trainTf)

      // Apply the same IDF weights to both sets.
      // Resulting columns: label (double), tfidf_vec (sparse vector of size 100000).
      val train = idfModel.transform(trainTf).drop("tf_vec")  // training set
      val test = idfModel.transform(testTf).drop("tf_vec")    // test set

      // Configure the Naive Bayes estimator.
      val naiveBayes = new NaiveBayes()
        .setLabelCol("label")
        .setFeaturesCol("tfidf_vec")
        .setSmoothing(1.0)  // Laplace smoothing coefficient

      // Train the Naive Bayes model.
      val bayesModel = naiveBayes.fit(train)

      // Save the model; overwrite so that re-running the job does not fail
      // because the target directory already exists.
      bayesModel.write.overwrite().save("profile/data/bayemodel")

      // Load the model back from disk.
      val bayesModel_load = NaiveBayesModel.load("profile/data/bayemodel")

      // Classify the test set with the loaded model.
      val result = bayesModel_load.transform(test).drop("tfidf_vec")
      // Two counts follow; cache so the prediction is computed only once.
      result.cache()

      //result.show(100,false)

      // Accuracy = correctly predicted test rows / all test rows.
      val correct = result.where("label=prediction").count()
      val total = result.count()
      println(correct.toDouble/total.toDouble)
    } finally {
      spark.close()
    }
  }

}
